import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
http://archive.ics.uci.edu/ml/datasets/Forest+Fires
http://www3.dsi.uminho.pt/pcortez/forestfires/
Forest fire data from the Montesinho natural park, in the Trás-os-Montes northeast region of Portugal. This park contains a high diversity of flora and fauna. Set within a supra-Mediterranean climate, the average annual temperature is within the range 8 to 12 °C. The data used in the experiments was collected from January 2000 to December 2003 and was built from two sources. The first database was collected by the inspector who was responsible for the Montesinho fire occurrences. On a daily basis, every time a forest fire occurred, several features were registered, such as the time, date, spatial location within a 9×9 grid, the type of vegetation involved, the six components of the FWI system and the total burned area. The second database was collected by the Bragança Polytechnic Institute and contains several weather observations (e.g. wind speed) that were recorded at 30-minute intervals by a meteorological station located in the center of the Montesinho park. The two databases were stored in tens of individual spreadsheets, under distinct formats, and a substantial manual effort was required to integrate them into a single dataset with a total of 517 entries.
# Name of the dataset and the location of its CSV file; the directory and
# file name are kept as separate variables and joined by concatenation.
dataset_name = 'forestfire'
file_path, file_name = '~/data/forest-fires/', 'forestfires.csv'
file = ''.join([file_path, file_name])

# Read the CSV into a dataframe, then inspect its dimensions and the
# per-column summary printed by info().
df = pd.read_csv(file)
df.shape
df.info()
# Encode the abbreviated weekday names as integers, 'mon' -> 1 ... 'sun' -> 7.
# A hand-built lookup table is used for the days; values missing from the
# table become NaN under Series.map.
day_names = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
d = {abbr: number for number, abbr in enumerate(day_names, start=1)}
df['day'] = df['day'].map(d)

# Encode the abbreviated month names as integers by parsing them with the
# '%b' format and keeping only the month number of the resulting datetime.
month_as_date = pd.to_datetime(df['month'], format='%b')
df['month'] = month_as_date.dt.month

df.head()
df.describe()
# Check the dataframe for missing data.
# DataFrame.isnull is an alias of DataFrame.isna, so OR-ing the two masks
# together added nothing; the mask is computed once and reused instead.
missing = df.isna()
# Per column: does it contain any missing value at all?
missing.any()
# Per column: how many values are missing?
missing.sum()
# Collect the column names to plot: every column present at this point
# except the 'area' column.
feature_names = df.columns.tolist()
feature_names.remove('area')

# Boolean helper column: True where area is greater than zero. It is used
# as the grouping variable in the plots.
has_burned = df['area'] > 0
df['burned_area'] = has_burned

# Count the samples with area > 0 and the samples with area == 0.
num_burned_area = len(df[has_burned])
num_non_burned_area = len(df[df['area'] == 0])
print('Samples with burned area: ', num_burned_area)
print('Samples without burned area: ', num_non_burned_area)

# Bar chart of the two class sizes, saved to disk before being shown.
sns.countplot(data=df, x='burned_area')
plt.savefig(dataset_name + '_countplot.png')
plt.show()
# Grid of box plots: one subplot per feature, grouped by the burned_area flag.
x_value = 'burned_area'
num_columns = 4
num_plots = len(feature_names)
num_rows = math.ceil(num_plots / num_columns)  # round up

fig, axs = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(16,22))

# axs is a 1-D array for a single row and a 2-D array otherwise; iterating
# axs.flat walks the subplots in row-major order in both cases, so feature
# i lands on row i // num_columns, column i % num_columns.
for name, ax in zip(feature_names, axs.flat):
    sns.boxplot(x=x_value, y=name, data=df, ax=ax, boxprops=dict(alpha=.9))

plt.savefig(dataset_name + '_boxplots.png')
plt.show()
# Grid of violin plots with the individual samples overlaid as a swarm plot,
# one subplot per feature, grouped by the burned_area flag.
x_value = 'burned_area'
num_columns = 4
num_plots = len(feature_names)
num_rows = math.ceil(num_plots / num_columns)  # round up

fig, axs = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(16,22))

# Normalise the axes to a (rows, columns) grid so the single-row layout
# (where plt.subplots returns a 1-D array) needs no special case.
axes_grid = axs.reshape(num_rows, num_columns)
for index, name in enumerate(feature_names):
    row, col = divmod(index, num_columns)
    target = axes_grid[row, col]
    sns.violinplot(x=x_value, y=name, data=df, ax=target)
    sns.swarmplot(x=x_value, y=name, data=df, ax=target, color='k', size=3)

plt.savefig(dataset_name + '_violinplots.png')
plt.show()
# Overlaid distribution plots, one subplot per feature: samples whose area is
# zero versus samples whose area is greater than zero.
num_plots = len(feature_names)
num_columns = 3
num_rows = math.ceil(num_plots / num_columns)  # round up
label = 'area'

# Split the samples once; the two subsets are the same for every feature.
not_burned = df[df[label] == 0]
burned = df[df[label] > 0]

fig, axs = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(18,16))

# axs.flat walks the subplots in row-major order for both the 1-D (single
# row) and 2-D layouts returned by plt.subplots.
for name, ax in zip(feature_names, axs.flat):
    # Attach the legend text to each plot explicitly. The previous single
    # plt.legend([...]) call after the loop only reached the current axes
    # and paired labels with artists purely by their order.
    sns.distplot(not_burned[name], ax=ax, label='not burned')
    sns.distplot(burned[name], ax=ax, label='burned')
    ax.legend()

plt.savefig(dataset_name + '_distplots_by_burned_area.png')
plt.show()
# Distribution plot of every feature over all samples (not split by
# burned area), one subplot per feature.
num_plots = len(feature_names)
num_columns = 3
# Round up to the number of rows needed. The earlier formula,
# int(num_plots/num_columns + num_plots%num_columns), added the whole
# remainder and so allocated surplus empty rows (e.g. 14 plots in 3 columns
# gave 6 rows instead of 5).
num_rows = math.ceil(num_plots / num_columns)

fig, axs = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(18,16))

# axs.flat walks the subplots in row-major order for both the 1-D (single
# row) and 2-D layouts returned by plt.subplots.
for name, ax in zip(feature_names, axs.flat):
    sns.distplot(df[name], ax=ax)

plt.savefig(dataset_name + '_distplots.png')
plt.show()
# Pairwise correlation between the dataframe's columns.
corr = df.corr()
corr

# Draw the correlation matrix as an annotated heatmap on a dedicated axes.
fig, ax = plt.subplots(1, 1, figsize=(12,8))
heatmap_options = dict(
    cmap="coolwarm",   # colour map
    annot=True,        # write the value into each cell
    fmt='.2f',         # format code for the annotations
    linewidths=.05,    # width of the lines separating the cells
    # square=True,             # optional: force square cells
    # annot_kws={"size": 14},  # optional: annotation font size
)
hm = sns.heatmap(corr, ax=ax, **heatmap_options)

# Add the title, leaving room for it above the plot, then save and show.
fig.suptitle(dataset_name + ' attributes correlation heatmap', fontsize=14, fontweight='bold')
fig.subplots_adjust(top=0.93)
plt.savefig(dataset_name + '_heatmap.png')
plt.show()
# Pairwise scatter plots of the dataframe's columns, coloured by the
# burned_area flag, with histograms on the diagonal.
scatter_style = {'alpha': 0.5, 's': 40, 'edgecolor': 'k'}
pp = sns.pairplot(
    df,
    hue='burned_area',
    diag_kind='hist',
    height=2,
    aspect=1.2,
    plot_kws=scatter_style,
)

# Title the underlying figure, leaving room for it above the grid.
fig = pp.fig
fig.suptitle(dataset_name + ' attributes pairwise plot', fontsize=14, fontweight='bold')
fig.subplots_adjust(top=0.93, wspace=0.3)
plt.savefig(dataset_name + '_pairplot_hist.png')
plt.show()
# Pairwise scatter plots of the dataframe's columns (no hue), with shaded
# kernel density estimates on the diagonal.
off_diagonal_style = dict(edgecolor="k", linewidth=0.5)
diagonal_style = dict(shade=True)  # options for the diagonal plots only
pp = sns.pairplot(
    df,
    diag_kind="kde",
    diag_kws=diagonal_style,
    plot_kws=off_diagonal_style,
    height=2,
    aspect=1.2,
)

# Title the underlying figure, leaving room for it above the grid.
fig = pp.fig
fig.suptitle(dataset_name + ' attributes pairwise plot', fontsize=14, fontweight='bold')
fig.subplots_adjust(top=0.93, wspace=0.3)
plt.savefig(dataset_name + '_pairplot.png')
plt.show()